In this document, we will analyse DESeq2 results via a functional enrichment analysis
Set working directory and load libraries
#setwd("~/Desktop/rnaseq/")
setwd("~/Documents/students_MSc/clara/rnaseq/")
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse() masks IRanges::collapse()
## ✖ dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count() masks matrixStats::count()
## ✖ dplyr::desc() masks IRanges::desc()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks S4Vectors::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename() masks S4Vectors::rename()
## ✖ lubridate::second() masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice() masks IRanges::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gprofiler2)
Let’s load the results files we need.
# the list of DEG results from the previous exercises
res<-readRDS("./results/deseq2_results.rds")
# the annotations
xtrop<-read_csv("./xtr109/diamondblast109.csv")
## Rows: 24392 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): gene_id, transcript_id, peptide_id, xenx_pep_id, xenx_gene_symbol, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Ideally, the annotations of your genes should come from experimental evidence from your organism. If you work with mice, Drosophila or humans for example, many functional enrichment analysis tools will be very easy to implement because they will automatically connect your lists of genes to annotations, background lists etc. This is unlikely going to exist if you work with non-model systems. We are therefore dependent on making our own annotations and lists.
For our annotations, we are using BLAST results from querying our genes against the proteome of Xenopus tropicalis. This is a well studied frog species. It is still only distantly related to our focal species, but this is the best we can do. As we saw in the previous exercise, this results in many genes not having any annotations. These will be excluded unfortunately.
pull out DEGs and convert to xenopus IDs
pull_genes<-function(x, alpha=0.05, lfc=0, signed="both", feature="xenp_gene_symbol") {
# filter by adjusted p
x<-x %>%
filter(padj<alpha)
if(signed=="up"){
x<-x %>%
filter(log2FoldChange>lfc) %>%
pull(feature)
}
if(signed=="down"){
x<-x %>%
filter(log2FoldChange<(lfc*-1)) %>%
pull(feature)
}
if(signed=="both"){
x<-x %>%
filter(abs(log2FoldChange)>lfc) %>%
pull(feature)
}
## deal with multiple gene annotations for the same gene (different transcripts)
x<-strsplit(x, ";") %>% unlist() %>% unique()
x<-x[!is.na(x)]
return(x[!is.na(x)])
}
# now extract all
sig_deg<-lapply(res, FUN=function(x)
x %>%
as_tibble(rownames = "gene_id") %>%
left_join(xtrop) %>%
pull_genes(feature = "xenp_pep_id", alpha = 0.05, lfc=0, signed = "both")
)
## Joining with `by = join_by(gene_id)`
## Joining with `by = join_by(gene_id)`
## Joining with `by = join_by(gene_id)`
## Joining with `by = join_by(gene_id)`
## Joining with `by = join_by(gene_id)`
## Joining with `by = join_by(gene_id)`
str(sig_deg)
## List of 6
## $ bui: chr [1:87] "ENSXETP00000118428" "ENSXETP00000077066" "ENSXETP00000118788" "ENSXETP00000088743" ...
## $ can: chr [1:44] "ENSXETP00000108685" "ENSXETP00000056424" "ENSXETP00000026731" "ENSXETP00000066843" ...
## $ tur: chr [1:28] "ENSXETP00000109196" "ENSXETP00000105415" "ENSXETP00000110125" "ENSXETP00000090996" ...
## $ esp: chr [1:147] "ENSXETP00000042154" "ENSXETP00000095923" "ENSXETP00000038121" "ENSXETP00000058679" ...
## $ jab: chr [1:107] "ENSXETP00000079038" "ENSXETP00000116647" "ENSXETP00000089019" "ENSXETP00000086983" ...
## $ lla: chr [1:690] "ENSXETP00000042154" "ENSXETP00000095923" "ENSXETP00000090841" "ENSXETP00000013093" ...
There is some discussion about what makes a good background. Ideally, it should be the complete list of genes that could be differentially expressed. But what is this?
We will use the full set of genes that were returned by
DESeq2. This set should have filtered out genes that have
low counts (i.e. unlikely to be expressed across any of our
tissues/conditions).
We can use the same function from earlier to convert our list of Pelobates IDs to Xenopus peptide IDs.
# function to pull out xtr background IDs
extract_xtr<-function(x) {
return(
xtrop %>%
filter(gene_id %in% x) %>%
pull(xenp_pep_id) %>%
str_split(pattern=";") %>%
unlist() %>%
na.omit() %>%
unique()
)
}
xtr_bg<-lapply(res, FUN= function(x) extract_xtr(rownames(x)))
str(xtr_bg)
## List of 6
## $ bui: chr [1:12649] "ENSXETP00000113033" "ENSXETP00000042154" "ENSXETP00000095923" "ENSXETP00000041999" ...
## $ can: chr [1:12531] "ENSXETP00000113033" "ENSXETP00000042154" "ENSXETP00000095923" "ENSXETP00000041999" ...
## $ tur: chr [1:12854] "ENSXETP00000113033" "ENSXETP00000042154" "ENSXETP00000095923" "ENSXETP00000041999" ...
## $ esp: chr [1:12787] "ENSXETP00000113033" "ENSXETP00000042154" "ENSXETP00000095923" "ENSXETP00000041999" ...
## $ jab: chr [1:12608] "ENSXETP00000113033" "ENSXETP00000042154" "ENSXETP00000095923" "ENSXETP00000041999" ...
## $ lla: chr [1:12625] "ENSXETP00000113033" "ENSXETP00000042154" "ENSXETP00000095923" "ENSXETP00000041999" ...
# lets unify the background to use the same across all populations
xtr_bg_all<-xtr_bg %>%
unlist() %>%
unique()
We are now ready to go! There are a number of software and R packages that let you perform functional enrichment analysis. Here, we will use [g:Profiler])(https://biit.cs.ut.ee/gprofiler/), because it plays particularly well with R and with Ensembl gene/peptide annotations.
# set base url:
set_base_url("https://biit.cs.ut.ee/gprofiler_archive3/e109_eg56_p17/")
# run the analysis
res_ora<-gost(multi_query = FALSE, # returns separate results tables for multiquery
custom_bg = xtr_bg_all, # our background
query=sig_deg, # our list of gene sets
organism="xtropicalis", # the organism our annotations belong to
exclude_iea = FALSE, # include GO terms that were electronically assigned
correction_method = "gSCS", # the recommended multiple testing correction.
sources=c("GO:BP","GO:CC","GO:MF", "KEGG","REAC"), # the functional sets we are interested in
evcodes=FALSE, ## evcodes TRUE needed for downstream analysis like enrichment maps in Cytoscape, but this takes longer.
significant= FALSE) # return all terms, not just the significant ones
## Detected custom background input, domain scope is set to 'custom'.
# the results are stored as a "results" dataframe
head(res_ora$result)
## query significant p_value term_size query_size intersection_size
## 1 bui FALSE 0.05249863 27 84 5
## 2 bui FALSE 0.05249863 27 84 5
## 3 bui FALSE 0.05249863 27 84 5
## 4 bui FALSE 0.06506015 1867 84 31
## 5 bui FALSE 0.07414038 1777 84 30
## 6 bui FALSE 0.08410935 1889 84 31
## precision recall term_id source
## 1 0.05952381 0.18518519 GO:0002833 GO:BP
## 2 0.05952381 0.18518519 GO:0045089 GO:BP
## 3 0.05952381 0.18518519 GO:0031349 GO:BP
## 4 0.36904762 0.01660418 GO:0023052 GO:BP
## 5 0.35714286 0.01688239 GO:0007165 GO:BP
## 6 0.36904762 0.01641080 GO:0007154 GO:BP
## term_name effective_domain_size
## 1 positive regulation of response to biotic stimulus 12037
## 2 positive regulation of innate immune response 12037
## 3 positive regulation of defense response 12037
## 4 signaling 12037
## 5 signal transduction 12037
## 6 cell communication 12037
## source_order parents
## 1 1362 GO:00028....
## 2 11962 GO:00028....
## 3 7535 GO:00069....
## 4 7002 GO:0050789
## 5 2792 GO:00071....
## 6 2781 GO:0009987
We can use a p_value cutoff of 0.05 to see how many terms have been functionally enriched in each term.
res_ora$result %>%
filter(p_value<0.05) %>%
group_by(query) %>%
dplyr::count(query, sort=TRUE)
## # A tibble: 4 × 2
## # Groups: query [4]
## query n
## <chr> <int>
## 1 lla 18
## 2 bui 4
## 3 can 1
## 4 esp 1
gostplot(res_ora)
custom dot plot using the gprofiler results tables and ggplot.
res_ora$result %>%
select(query,term_name, p_value, intersection_size, query_size,source) %>%
filter(p_value<0.05) %>%
mutate(GeneRatio=intersection_size/query_size) %>%
arrange(GeneRatio) %>%
mutate(term_name = factor(term_name, levels=unique(term_name))) %>%
ggplot(aes(x=GeneRatio, y=term_name)) +
geom_point(aes(color=p_value, size=intersection_size)) +
ylab("") +
#scale_color_scico(palette = "batlow", direction = -1) +
facet_grid(source~query,scales = "free_y",space = "free") +
theme_bw()
No overlap between populations